import sklearn
# pip install scikit-learn==1.2.2
print(sklearn.__version__)
should be 1.2.2 (otherwise it may not work :( )
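A quick guard cell can fail fast on a version mismatch (an optional sketch based on the note above):
# raise early if the pinned scikit-learn version is not the active one
assert sklearn.__version__.startswith('1.2'), f'expected scikit-learn 1.2.x, got {sklearn.__version__}'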
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import plotly.express as px
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
warnings.filterwarnings('ignore')
data = pd.read_csv('../Data/our_data.csv')
X = data.drop('Class', axis=1)
y = data['Class']
X_train, X_val, y_train, y_val = train_test_split(
X, y, stratify=y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
X_val, y_val, stratify=y_val, test_size=0.3, random_state=42
)
X_train = X_train.drop(['Compactness','EquivDiameter', 'Area'], axis=1)
X_val = X_val.drop(['Compactness','EquivDiameter','Area'], axis=1)
X_test = X_test.drop(['Compactness','EquivDiameter','Area'], axis=1)
cols = X_train.columns
scaling = sklearn.preprocessing.PowerTransformer(method='box-cox')  # Box-Cox requires strictly positive features
X_train = scaling.fit_transform(X_train)
X_test = scaling.transform(X_test)
X_val = scaling.transform(X_val)
X_train = pd.DataFrame(X_train, columns=cols)
X_test = pd.DataFrame(X_test, columns=cols)
X_val = pd.DataFrame(X_val, columns=cols)
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(y_train.to_frame())
y_encoded = pd.DataFrame(enc.transform(y_train.to_frame()).toarray(), columns=enc.get_feature_names_out(['Class']))
y_val_encoded = pd.DataFrame(enc.transform(y_val.to_frame()).toarray(), columns=enc.get_feature_names_out(['Class']))
y_test_encoded = pd.DataFrame(enc.transform(y_test.to_frame()).toarray(), columns=enc.get_feature_names_out(['Class']))
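# note: the one-hot targets above are kept for reference only; every model below trains on the integer-encoded labels (y_encoded2)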
#standard encoding 0,1,2,...
labelencoder = sklearn.preprocessing.LabelEncoder()
y_encoded2 = pd.DataFrame(labelencoder.fit_transform(y_train))
# use transform (not fit_transform) so val/test share the mapping fitted on train
y_val_encoded2 = pd.DataFrame(labelencoder.transform(y_val))
y_test_encoded2 = pd.DataFrame(labelencoder.transform(y_test))
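# the learned mapping can be inspected to confirm the (alphabetical) label order:
print(dict(zip(labelencoder.classes_, labelencoder.transform(labelencoder.classes_))))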
X_train.columns
class_names = ['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA']
def hyperparameters_tuner(estimator, param_distributions, X, y, cv=5, n_iter=10, random_state=42):
random_search = RandomizedSearchCV(estimator, param_distributions=param_distributions, n_iter=n_iter, cv=cv, random_state=random_state)
random_search.fit(X, y)
return random_search.best_params_
def plot_confusion_matrix (y_true, y_pred, class_names):
cm = confusion_matrix(y_true, y_pred, labels=class_names)
    fig = px.imshow(cm, labels=dict(x="Predicted", y="Actual", color="Count"), x=class_names, y=class_names)
fig.update_xaxes(side="top")
for i in range(len(class_names)):
for j in range(len(class_names)):
fig.add_annotation(x=class_names[j], y=class_names[i], text=str(cm[i, j]), showarrow=False,font=dict(color="black" if cm[i, j] > cm.max()/2 else "white"))
fig.show()
def train_evaluate_encoded2(estimator, param_distributions, X_train, y_train, X_val, y_val, cv=5, class_names=['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA']):
best_params = hyperparameters_tuner(estimator, param_distributions, X_train, y_train)
best_model = estimator.set_params(**best_params)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
cv_results = cross_val_score(best_model, X_train, y_train, cv=cv)
cv_val_results = cross_val_score(best_model, X_val, y_val, cv=cv)
    y_pred = labelencoder.inverse_transform(y_pred)
    y_val = labelencoder.inverse_transform(np.ravel(y_val))
print(f"Best parameters: {best_params}")
print('__________________________________________________________')
print(f"Accuracy: {accuracy}")
plot_confusion_matrix(y_val, y_pred, class_names)
    print(classification_report(y_val, y_pred, target_names=class_names))
print("__________________________________________________________")
print(f"Cross-validation results: {cv_results}")
print(f"Mean accuracy: {cv_results.mean()}")
print(f"Cross-validation results on validation set: {cv_val_results}")
print(f"Mean accuracy on validation set: {cv_val_results.mean()}")
return best_params, best_model
def train_evaluate(estimator, param_distributions, X_train, y_train, X_val, y_val, cv=5, class_names=['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA']):
best_params = hyperparameters_tuner(estimator, param_distributions, X_train, y_train)
best_model = estimator.set_params(**best_params)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
cv_results = cross_val_score(best_model, X_train, y_train, cv=cv)
cv_val_results = cross_val_score(best_model, X_val, y_val, cv=cv)
print(f"Best parameters: {best_params}")
print('__________________________________________________________')
print(f"Accuracy: {accuracy}")
plot_confusion_matrix(y_val, y_pred, class_names)
    print(classification_report(y_val, y_pred, target_names=class_names))
print("__________________________________________________________")
print(f"Cross-validation results: {cv_results}")
print(f"Mean accuracy: {cv_results.mean()}")
print(f"Cross-validation results on validation set: {cv_val_results}")
print(f"Mean accuracy on validation set: {cv_val_results.mean()}")
return best_params, best_model
AutoML with TPOT was run and its results are in the presentation, but the code got deleted from the notebook; the suggested model was some MLP variant.
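A minimal sketch of what that deleted cell may have looked like (hypothetical reconstruction; assumes the classic tpot package is installed, and all settings are guesses):
# hypothetical reconstruction of the deleted TPOT cell; the settings here are assumptions
from tpot import TPOTClassifier
tpot = TPOTClassifier(generations=5, population_size=20, cv=5, random_state=42, verbosity=2)
tpot.fit(X_train, np.ravel(y_encoded2))
print(tpot.score(X_val, np.ravel(y_val_encoded2)))
tpot.export('tpot_best_pipeline.py')  # reportedly suggested an MLP-based pipeline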
dist = dict(C=[10 ** x for x in range(-4, 3)], penalty=['l2', 'l1'])
lr = LogisticRegression(max_iter=1000,solver='saga', multi_class='multinomial')
lr_best_params, lr_best = train_evaluate_encoded2(lr, dist, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)
The model performs well; cross-validation detects neither over- nor underfitting.
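The no-overfitting claim can also be spot-checked by comparing train and validation accuracy directly (a small sketch using the fitted model returned above):
# a small train/validation accuracy gap is consistent with no strong over- or underfitting
print('train accuracy:', lr_best.score(X_train, np.ravel(y_encoded2)))
print('val accuracy:  ', lr_best.score(X_val, np.ravel(y_val_encoded2)))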
data = pd.read_csv('../Data/our_data.csv')
X = data.drop('Class', axis=1)
y = data['Class']
X_train, X_val, y_train, y_val = train_test_split(
X, y, stratify=y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
X_val, y_val, stratify=y_val, test_size=0.3, random_state=42
)
# note: this run keeps the full feature set (the partial-dependence plots below use Compactness and Area);
# the *_filtered frames are created for reference but not used further
X_train_filtered = X_train.drop(['Compactness','EquivDiameter', 'Area'], axis=1)
X_val_filtered = X_val.drop(['Compactness','EquivDiameter','Area'], axis=1)
X_test_filtered = X_test.drop(['Compactness','EquivDiameter','Area'], axis=1)
cols = X_train.columns
scaling = sklearn.preprocessing.StandardScaler()
X_train = scaling.fit_transform(X_train)
X_test = scaling.transform(X_test)
X_val = scaling.transform(X_val)
X_train = pd.DataFrame(X_train, columns=cols)
X_test = pd.DataFrame(X_test, columns=cols)
X_val = pd.DataFrame(X_val, columns=cols)
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(y_train.to_frame())
y_encoded = pd.DataFrame(enc.transform(y_train.to_frame()).toarray(), columns=enc.get_feature_names_out(['Class']))
y_val_encoded = pd.DataFrame(enc.transform(y_val.to_frame()).toarray(), columns=enc.get_feature_names_out(['Class']))
y_test_encoded = pd.DataFrame(enc.transform(y_test.to_frame()).toarray(), columns=enc.get_feature_names_out(['Class']))
#standard encoding 0,1,2,...
labelencoder = sklearn.preprocessing.LabelEncoder()
y_encoded2 = pd.DataFrame(labelencoder.fit_transform(y_train))
# use transform (not fit_transform) so val/test share the mapping fitted on train
y_val_encoded2 = pd.DataFrame(labelencoder.transform(y_val))
y_test_encoded2 = pd.DataFrame(labelencoder.transform(y_test))
dist = dict(n_estimators=[5, 10, 25, 50, 100, 200, 250, 500, 1000],
            criterion=['gini', 'entropy', 'log_loss'],
            max_depth=[1, 5, 10, 25, 50, 100, 150],
            min_samples_split=[2, 5, 10, 25, 50, 100, 250, 500])  # min_samples_split must be >= 2
rf = RandomForestClassifier(random_state=42)
rf_best_params, rf_best = train_evaluate_encoded2(rf, dist, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)
svc = SVC()
dist = {'C': [0.1, 1, 10, 100, 1000, 10000],
        'gamma': [1, 0.01, 0.0001],
        'kernel': ['rbf', 'poly', 'sigmoid']}
svc_best_params, svc_best = train_evaluate_encoded2(svc, dist, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)
nb = GaussianNB()
dist = {'var_smoothing': np.logspace(0,-9, num=100)}
nb_best_params, nb_best= train_evaluate_encoded2(nb, dist, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)
Weak results, but so be it; we won't use this model further.
dt = DecisionTreeClassifier()
dist = dict(criterion=['gini', 'entropy', 'log_loss'],
            max_depth=[1, 5, 7, 9, 10, 11, 15, 20, 50, 75, 100],
            min_samples_split=[2, 3, 4, 5, 7, 10, 100, 250, 500])  # min_samples_split must be >= 2
dt_best_params, dt_best = train_evaluate_encoded2(dt, dist, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)
dist = {'n_neighbors': [3, 5,7,8, 9, 10,11, 12, 15, 21]}
kn = KNeighborsClassifier()
# this doesn't work for me, possibly because of sklearn; I'm on 1.4.2 but I'm not switching versions mid-project, since everything else apart from stacking works
kn_best_params, kn_best = train_evaluate_encoded2(kn, dist, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)
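If this cell fails on the installed sklearn version, a plain loop over the same grid is a version-agnostic fallback (a minimal sketch):
# fallback: manual search over n_neighbors, avoiding RandomizedSearchCV entirely
best_k, best_acc = None, 0.0
for k in dist['n_neighbors']:
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, np.ravel(y_encoded2))
    acc = model.score(X_val, np.ravel(y_val_encoded2))
    if acc > best_acc:
        best_k, best_acc = k, acc
print(f'best n_neighbors: {best_k}, validation accuracy: {best_acc}')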
from xgboost import XGBClassifier
xgb = XGBClassifier(subsample=0.8, n_estimators=300, max_depth=7, learning_rate=0.1, gamma=0.1, colsample_bytree=0.5)
param_grid = {
'n_estimators': [100, 200, 300], # Number of boosting rounds
'max_depth': [3, 5, 7], # Maximum depth of the tree
'learning_rate': [0.01, 0.1, 0.3], # Step size shrinkage used in update to prevent overfitting
'subsample': [0.5, 0.8, 1.0], # Subsample ratio of the training instance
'colsample_bytree': [0.5, 0.8, 1.0], # Subsample ratio of columns when constructing each tree
'gamma': [0, 0.1, 0.2] # Minimum loss reduction required to make a further partition on a leaf node of the tree
}
xgb_best_params, xgb_best = train_evaluate_encoded2(xgb, param_grid, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)
from sklearn.ensemble import StackingClassifier
lr = LogisticRegression(C=100, penalty='l1', max_iter=1000, solver='saga', multi_class='multinomial')
svc = SVC(kernel='rbf', gamma=0.01, C=10000)
dt = DecisionTreeClassifier(min_samples_split=10, max_depth=15, criterion='entropy')
nb = GaussianNB(var_smoothing=2.848035868435799e-08)
kn = KNeighborsClassifier(n_neighbors=9)
rf = RandomForestClassifier(n_estimators=200, min_samples_split=5, max_depth=25, criterion='log_loss')
models = [('lr', lr), ('svc', svc), ('nb', nb), ('rf', rf), ('dt', dt), ('kn', kn)]
stack = StackingClassifier(estimators=models, final_estimator=LogisticRegression(max_iter=1000, solver='saga', multi_class='multinomial'))
stack.fit(X_train, y_encoded2)
y_pred = stack.predict(X_val)
y_val = labelencoder.inverse_transform(np.ravel(y_val_encoded2))
y_pred = labelencoder.inverse_transform(y_pred)
print(classification_report(y_val, y_pred, target_names=class_names))
plot_confusion_matrix(y_val, y_pred, class_names)
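# note: with an empty search space, RandomizedSearchCV below simply refits the stack; the call only reuses the reporting code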
train_evaluate_encoded2(stack, {}, X_train, y_encoded2, X_val, y_val_encoded2, cv=5)
#pip install pdpbox
#pip install lime
import lime
from lime import lime_tabular
import shap
from sklearn.tree import export_graphviz
import graphviz
from pdpbox import pdp, info_plots
from sklearn.inspection import PartialDependenceDisplay
from sklearn import tree
def visualize_model_lime(best_model, X_train, X_val, class_names, sample_idx):
# Create a LIME explainer
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values,
feature_names=X_train.columns.tolist(),
class_names=class_names,
discretize_continuous=True)
# Explain the prediction for a sample from the validation set
exp = explainer.explain_instance(X_val.values[sample_idx],
best_model.predict_proba,
num_features=len(X_train.columns),
top_labels=len(class_names))
# Show the explanation
exp.show_in_notebook()
def visualize_model_shap(best_model, X_train, X_val, class_names):
# Visualize using SHAP
shap.initjs()
explainer_shap = shap.TreeExplainer(best_model)
shap_values = explainer_shap.shap_values(X_val)
shap.summary_plot(shap_values, X_val, feature_names=X_train.columns, class_names=class_names)
def visualize_model_tree(best_model, X_train, X_val, class_names):
#visualize tree
tree1 = best_model
feature_names = X_train.columns
dot_data = tree.export_graphviz(tree1,
feature_names=feature_names,
class_names=class_names,
filled=True, rounded=True,
special_characters=True,
out_file=None,
)
graph = graphviz.Source(dot_data)
graph.format = "png"
graph.render("tree")
graph.view()
# this probably doesn't work, but I'm not sure how it's supposed to; the tree could be pasted in from what you did earlier
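A Graphviz-free alternative is sklearn's own matplotlib-based plotter (a minimal sketch; max_depth only limits the drawing, not the model):
from sklearn.tree import plot_tree
def visualize_model_tree_mpl(best_model, X_train, class_names, max_depth=3):
    # renders the fitted tree with matplotlib only, so no Graphviz binary is needed
    plt.figure(figsize=(20, 10))
    plot_tree(best_model, feature_names=list(X_train.columns), class_names=class_names,
              filled=True, rounded=True, max_depth=max_depth)
    plt.show()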
def visualize_model_partial_dependence(best_model, X_train, features, target_class, class_names):
    # from_estimator draws the plot itself; an extra .plot() call would only re-render a default view
    display = PartialDependenceDisplay.from_estimator(best_model, X_train, features, target=target_class)
    display.figure_.suptitle(f'Partial dependence for class {class_names[target_class]}')
    plt.show()
for index in range(7):
visualize_model_lime(stack, X_train, X_val, class_names, index)
for index in range(7):
visualize_model_lime(kn_best, X_train, X_val, class_names, index)
visualize_model_shap(xgb_best, X_train, X_val, class_names)
class_names = ['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER', 'SIRA']
features = ['Compactness', 'ShapeFactor1', ('Compactness', 'ShapeFactor1')]
for target_class in range(2):
visualize_model_partial_dependence(xgb_best, X_train, features, target_class, class_names)
features = ['Area', 'Perimeter', ('Area','Perimeter')]
for target_class in range(7):
visualize_model_partial_dependence(xgb_best, X_train, features, target_class, class_names)
I don't know why everything above came out identical; a plausible culprit is the redundant display.plot() call in the original helper, which re-rendered each figure, plus the lack of per-class titles. The fixed helper above titles and shows each figure separately, which should make per-class differences visible.
for index in range(7):
visualize_model_lime(xgb_best, X_train, X_val, class_names, index)
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import label_binarize
# Convert labels to binary format
y_val_binarized = label_binarize(y_val, classes=np.unique(y_val))
n_classes = y_val_binarized.shape[1]
# Initialize subplots
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 10))
models = [xgb_best.predict_proba(X_val), #lr_best.predict_proba(X_val),
rf_best.predict_proba(X_val), nb_best.predict_proba(X_val), dt_best.predict_proba(X_val), stack.predict_proba(X_val), kn_best.predict_proba(X_val)]
model_names = ['XGBoost Classifier', #'Logistic Regression',
'Random Forest', 'Naive Bayes', 'Decision Tree', 'Stacking Classifier', 'KNeighbors Classifier']
for idx, (model, model_name) in enumerate(zip(models, model_names)):
row = idx // 2
col = idx % 2
ax = axes[row, col]
for i in range(n_classes):
fpr, tpr, _ = roc_curve(y_val_binarized[:, i], model[:, i])
roc_auc = auc(fpr, tpr)
ax.plot(fpr, tpr, lw=2, label=f'{class_names[i]} (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
ax.set_xlim([0.0, 1])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title(f'ROC Curve for {model_name}')
ax.legend(loc='lower right')
ax.grid(True)
plt.tight_layout()
plt.show()
# Same ROC curves as above, zoomed into the low-FPR region (x-axis limited to 0.14)
# Convert labels to binary format
y_val_binarized = label_binarize(y_val, classes=np.unique(y_val))
n_classes = y_val_binarized.shape[1]
# Initialize subplots
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 10))
models = [xgb_best.predict_proba(X_val),#lr_best.predict_proba(X_val),
rf_best.predict_proba(X_val), nb_best.predict_proba(X_val), dt_best.predict_proba(X_val), stack.predict_proba(X_val), kn_best.predict_proba(X_val)]
model_names = ['XGBoost Classifier', #'Logistic Regression',
'Random Forest', 'Naive Bayes', 'Decision Tree', 'Stacking Classifier', 'KNeighbors Classifier']
for idx, (model, model_name) in enumerate(zip(models, model_names)):
row = idx // 2
col = idx % 2
ax = axes[row, col]
for i in range(n_classes):
fpr, tpr, _ = roc_curve(y_val_binarized[:, i], model[:, i])
roc_auc = auc(fpr, tpr)
ax.plot(fpr, tpr, lw=2, label=f'{class_names[i]} (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
ax.set_xlim([0.0, 0.14])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title(f'ROC Curve for {model_name}')
ax.legend(loc='lower right')
ax.grid(True)
plt.tight_layout()
plt.show()
This may not work on Windows; there is a tree.png file you can view instead.
#visualize_model_tree(dt_best, X_train, X_val, class_names)
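Alternatively, the pre-rendered file can be displayed inline (assuming tree.png sits next to the notebook):
# show the already-generated tree image without invoking Graphviz
from IPython.display import Image
Image('tree.png')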
# accuracy of the stacking model on the test set
y_pred = stack.predict(X_test)
y_test = labelencoder.inverse_transform(np.ravel(y_test_encoded2))
y_pred = labelencoder.inverse_transform(y_pred)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=class_names))
cm = confusion_matrix(y_test, y_pred, labels=class_names)
plot_confusion_matrix(y_test, y_pred, class_names)
#without SIRA
class_names = ['BARBUNYA', 'BOMBAY', 'CALI', 'DERMASON', 'HOROZ', 'SEKER']
data = data[data['Class'] != 'SIRA']
X_s = data.drop('Class', axis=1)
y_s = data['Class']
X_s_train, X_s_val, y_s_train, y_s_val = train_test_split(X_s, y_s, test_size=0.3, random_state=42)
# fit the encoder on the training labels, then reuse the mapping for validation
y_s_train = labelencoder.fit_transform(y_s_train)
y_s_val = labelencoder.transform(y_s_val)
X_s_train = X_s_train.drop(['Compactness','EquivDiameter','Area'], axis=1)
X_s_val = X_s_val.drop(['Compactness','EquivDiameter','Area'], axis=1)
cols_s = X_s_train.columns
X_s_train = scaling.fit_transform(X_s_train)
X_s_val = scaling.transform(X_s_val)
X_s_train = pd.DataFrame(X_s_train, columns=cols_s)
X_s_val = pd.DataFrame(X_s_val, columns=cols_s)
kn_s = KNeighborsClassifier(n_neighbors=8)
kn_s.fit(X_s_train, y_s_train)
y_s_pred = kn_s.predict(X_s_val)
print(accuracy_score(y_s_val, y_s_pred))
print(classification_report(y_s_val, y_s_pred))
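The earlier helper can be reused to see where the remaining six classes get confused (assumes the label encoder was fitted on the six-class training labels above):
# confusion matrix for the SIRA-free model, mapped back to string labels
plot_confusion_matrix(labelencoder.inverse_transform(y_s_val),
                      labelencoder.inverse_transform(y_s_pred),
                      class_names)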